import pandas as pd
import time
import urllib.request, urllib.parse, urllib.error
import json
import ssl
from timezonefinder import TimezoneFinder
from datetime import datetime
# Load the "Laser Report 2014" sheet from the local Excel workbook 1.xls.
df = pd.read_excel('1.xls', sheet_name="Laser Report 2014")
After some trial and error I decided to use the Google Maps Geocoding API to collect latitude and longitude data for each city listed in the dataframe. The function below returns the coordinates for a town supplied as a string.
def getlatlon(location):
    """Geocode a free-text place name with the Google Maps Geocoding API.

    Args:
        location: place name string, e.g. 'washington, dc'.

    Returns:
        A 'lat,lng' string on success, or None when the response cannot
        be parsed or contains no results.  (The original returned ','
        on failure, which made callers' `is not None` checks dead code.)
    """
    api_key = "Insert Your Own API Key Here"
    google_url = "https://maps.googleapis.com/maps/api/geocode/json?"
    parms = {'address': location, 'key': api_key}
    url = google_url + urllib.parse.urlencode(parms)
    # Context manager guarantees the HTTP response is closed.
    with urllib.request.urlopen(url) as uh:
        data = uh.read().decode()
    try:
        js = json.loads(data)
        loc = js['results'][0]['geometry']['location']
        return '{},{}'.format(loc['lat'], loc['lng'])
    except (ValueError, KeyError, IndexError, TypeError):
        # Malformed JSON, empty results array, or unexpected schema.
        return None
# Quick sanity check of the geocoder (requires a valid API key to succeed).
getlatlon('Washington, DC')
This converts the city and state to lower case, since a few different combinations of upper and lower case were used, and then combines them into one string ready for passing to the lat/lon getter function.
# Normalise case on city and state, then join them into one lookup key.
city = df['CITY'].str.lower()
state = df['STATE'].str.lower()
df['location'] = city + ", " + state
df.head()
To avoid multiple API requests for the same location I created a frame of unique locations then passed that into a regular Python list ready for processing
# One row per distinct location, NaNs removed, then out to a plain list
# so each place is geocoded only once.
locationlist = pd.DataFrame(df['location'].unique(), columns=['location'])
locationlist = locationlist.dropna(axis=0)
location_pylist = locationlist['location'].values.tolist()
len(location_pylist)
The next two cells initialise an empty list and fill the list with dictionaries of location data. The function checks to see if the location is already in the list and if it isn't it goes searching for it.
I've commented the initialiser out to avoid accidentally erasing all the data, it is no longer necessary to run these cells as I saved the location data to a CSV to avoid using the location API over and over.
# location_list = []
counter = 0
for location in location_pylist:
    # Fix: the original inner scan used `continue` after a match, which
    # merely moved on to the next cached entry instead of stopping the
    # search; any() short-circuits as soon as the location is found.
    found = any(location in cached.values() for cached in location_list)
    if not found:
        print('Processing:' + location)
        lat_long = getlatlon(location)
        if lat_long is not None:
            # Crude rate limit: pause every tenth successful lookup so
            # the geocoding API is not hammered.
            if counter % 10 == 0:
                time.sleep(10)
            lat, lon = lat_long.split(',')
            location_dic = {
                'location': location,
                'lat': lat,
                'lon': lon,
                # get_timezone is defined elsewhere — presumably wraps
                # TimezoneFinder (imported at the top); TODO confirm.
                'zone': get_timezone(lat, lon),
            }
            location_list.append(location_dic)
            counter += 1
# locations_frame = pd.DataFrame(location_list)
# locations_frame.to_csv(r'locations.csv', index = False, header=True)
This loads the location data I processed previously and saved.
# Reload the previously geocoded locations saved to disk.
loc_merge_frame = pd.read_csv('locations.csv')
loc_merge_frame.head()
location_list = loc_merge_frame['location'].values.tolist()
len(location_list)
We now import folium to visualise the location coordinates on a map.
import folium
# Pull the coordinate pairs, drop missing rows, and coerce to floats.
locations = loc_merge_frame[['lat', 'lon']].dropna()
locationlist = locations.values.tolist()
for pair in locationlist:
    if pair[0] != '':
        pair[0] = float(pair[0])
        pair[1] = float(pair[1])
# Centre the map roughly on the continental USA.
fol_map = folium.Map(location=[37.0902, -95.7129], zoom_start=4)
for pair in locationlist:
    if pair[0] != '':
        folium.CircleMarker(pair, radius=2).add_to(fol_map)
fol_map
We have the locations and lat/longs - what is needed is to merge this back into the original dataframe so that we have the coordinates for all the locations.
loc_merge_frame.head()
# Join the coordinates back onto the incident rows, then drop the key
# column now that it has served its purpose.
df = df.merge(loc_merge_frame, on='location')
df = df.drop(columns='location')
df.head()
We now import the modules for KMeans clustering and view the elbow method to see the most optimal k
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
%matplotlib inline
# Coordinates only; k-means cannot handle missing values, so drop NaNs.
loc_df = df[['lat', 'lon']]
loc_df.shape
loc_df = loc_df.dropna()
# Elbow method: record the inertia (within-cluster sum of squared
# distances) for k = 1..20 and look for the bend in the curve.
inertia_list = []
x = range(1, 21)
for i in x:
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(loc_df)
    # Fix: the original also ran predict() and read cluster_centers_
    # on every iteration, but neither result was used — the elbow plot
    # only needs the inertia.
    inertia_list.append(kmeans.inertia_)
plt.plot(x, inertia_list)
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# Final clustering with k = 6; overlay the centroids in red on the map.
kmeans = KMeans(n_clusters=6).fit(loc_df)
labels = kmeans.predict(loc_df)
centroids = kmeans.cluster_centers_
for centroid in centroids:
    folium.CircleMarker(centroid, radius=5, color='red').add_to(fol_map)
fol_map
These clusters don't look too bad, but the locations outside of the mainland USA are skewing the data, so I decided to filter these out, leaving just those in the contiguous 48 states.
# Restrict the data to the contiguous 48 states with a lat/lon bounding
# box (lon -125..-67, lat 24..50).
# Note: the original `df_tester = df` was an alias, not a copy, so the
# in-place drops mutated df anyway; filter explicitly instead.
# ~(outside) keeps rows with NaN coordinates, exactly as the drop-based
# version did (a NaN comparison is False, so NaN rows were never dropped).
outside_conus = (
    (df.lon < -125) | (df.lon > -67) | (df.lat < 24) | (df.lat > 50)
)
df = df[~outside_conus]
df_tester = df  # keep the original name bound for compatibility
# Re-plot the filtered coordinates on a fresh map centred on the USA.
locations = df[['lat', 'lon']].dropna()
locationlist = locations.values.tolist()
for pair in locationlist:
    if pair[0] != '':
        pair[0] = float(pair[0])
        pair[1] = float(pair[1])
usa_map = folium.Map(location=[37.0902, -95.7129], zoom_start=4)
for pair in locationlist:
    if pair[0] != '':
        folium.CircleMarker(pair, radius=2).add_to(usa_map)
usa_map
# Elbow method again, now on the filtered (contiguous-US) coordinates.
inertia_list = []
x = range(1, 21)
for i in x:
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(locations)
    # Fix: predict() and cluster_centers_ were computed here before but
    # never used; only the inertia feeds the elbow plot.
    inertia_list.append(kmeans.inertia_)
plt.plot(x, inertia_list)
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# Final clustering at k = 4 on the filtered data; centroids in red.
kmeans = KMeans(n_clusters=4).fit(locations)
labels = kmeans.predict(locations)
centroids = kmeans.cluster_centers_
for centroid in centroids:
    folium.CircleMarker(centroid, radius=5, color='red').add_to(usa_map)
usa_map
These clusterings and locations look much more logical than before.
We now need to convert the Date and Time fields into a useable Pandas Timestamp and extract the Hour from the time.
# Combine the DATE and TIME columns into one Timestamp, then pull the hour.
date_str = df['DATE'].astype(str)
time_str = df['TIME'].astype(str)
df['pd_time'] = pd.to_datetime(date_str + ' ' + time_str)
df['hour'] = df['pd_time'].dt.hour
df.head()
# Histogram of incidents by hour of day (times are UTC).
df['hour'].plot.hist(bins=10, rwidth=0.9, grid=True, figsize=(12, 10), color='#607c8e')
plt.title('Laser Attacks by UTC Hour')
plt.xlabel('UTC Hour')
plt.ylabel('Number of incidents')
plt.grid(axis='y', alpha=0.75)
We convert the altitude column to a numeric value
# ALT arrives as mixed text; coerce to numeric (unparseable values -> NaN).
df['ALT'] = pd.to_numeric(df['ALT'], errors='coerce')
df['ALT'].plot.hist(bins=50, rwidth=0.9, grid=True, figsize=(12, 10), color='#607c8e')
plt.title('Laser Attacks by Altitude')
plt.xlabel('Altitude (feet)')
plt.ylabel('Number of incidents')
plt.xlim(right=20000)
plt.xlim(left=0)
plt.grid(axis='y', alpha=0.75)
We now group the States together and plot the top 20
# Count incidents per state and plot the twenty busiest as a bar chart.
df_state = df.groupby(['STATE']).size().reset_index(name='Count')
top20 = (
    df_state.sort_values(by='Count', ascending=False)
    .head(20)
    .set_index('STATE')
)
top20.plot(kind='bar', figsize=(12, 10), color='#607c8e')
plt.title('Laser Attacks by State')
plt.xlabel('')
plt.ylabel('Number of incidents')
plt.grid(axis='y', alpha=0.75)
# Derive calendar features from the combined timestamp.
ts = df['pd_time'].dt
df['pd_date'] = ts.date
df['day_of_week'] = ts.day_name()
df['dow_number'] = ts.dayofweek
df.head()
df['month'] = ts.month
We now count the number of attacks on each day of the year
# Incidents per calendar day, ordered chronologically, as a line plot.
df_date = pd.DataFrame(df['pd_date'].value_counts()).reset_index()
df_date.columns = ['date', 'count']
df_date = df_date.sort_values(by=['date']).reset_index(drop=True)
df_date.plot(kind='line', figsize=(12, 10), color='#607c8e')
plt.title('Laser Attacks across the Year')
plt.xlabel('Day of the Year')
plt.ylabel('Number of incidents')
plt.grid(axis='y', alpha=0.75)
This is then reduced to counts by month.
# Aggregate the counts up to months and plot the seasonal shape.
df_month = pd.DataFrame(df['month'].value_counts()).reset_index()
df_month.columns = ['month', 'count']
df_month = df_month.sort_values(by=['month']).reset_index(drop=True)
df_month['count'].plot(kind='line', figsize=(12, 10), color='#607c8e')
plt.title('Laser Attacks by Month')
plt.xlabel('Month of the Year')
plt.ylabel('Number of incidents')
plt.ylim(top=500)
plt.ylim(bottom=150)
plt.xlim(left=-1)
plt.xticks(np.arange(12), ('J', 'F', 'M', 'A', 'M', 'J', 'J', 'A', 'S', 'O', 'N', 'D'))
plt.grid(axis='y', alpha=0.75)
We count the number of attacks by day of the week to see if this has any effect.
# Counts per weekday, reindexed so the week runs Monday through Sunday.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df_dow = (
    pd.DataFrame(df['day_of_week'].value_counts())
    .reindex(day_order)
    .reset_index()
)
df_dow.columns = ['dow', 'count']
df_dow.head(7)
df_dow.plot(kind='line', figsize=(12, 10), color='#607c8e')
plt.title('Laser Attacks by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Number of incidents')
plt.xticks(np.arange(7), ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'))
plt.ylim(top=700)
plt.ylim(bottom=150)
plt.grid(axis='y', alpha=0.75)
We create a new dataframe that counts the number of attacks by each hour in each state.
# Incident counts for every (hour, state) combination, largest first.
df_clean = df[['hour', 'STATE']].copy().dropna(axis=0)
df_clean = df_clean.groupby(['hour', 'STATE']).size().reset_index(name='Count')
df_clean.sort_values(by='Count', ascending=False).head(20)
For the analysis we create dummy variables for each State.
# One-hot encode the state column so it can feed the regressor.
cat_columns = ['STATE']
df_clean = pd.get_dummies(df_clean, columns=cat_columns, prefix_sep="_")
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Fix: train_test_split is used below but was never imported.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
import scipy
# Fix: `import scipy` alone does not guarantee the stats submodule is
# loaded, and scipy.stats.linregress is called later.
import scipy.stats
# Features are every column except the target incident count.
feature_cols = [col for col in df_clean.columns if col != 'Count']
X = df_clean[feature_cols]
y = df_clean['Count']
df_clean.head()
We split the data into test and training sets.
# Hold out 20% of the data for testing; fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
We create a KNN Regressor object and train the data, plotting the test vs predicted values
# Fit a 2-nearest-neighbour regressor, then compare test vs predicted
# values both as a scatter and as overlaid line plots.
neigh = KNeighborsRegressor(n_neighbors=2).fit(X_train, y_train)
y_pred = neigh.predict(X_test)
df_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_results.plot(kind='scatter', x='Actual', y='Predicted')
df_results = df_results.reset_index(drop=True)
df_results.head(200).plot(figsize=(20, 10))
plt.title('Comparing Predicted and Test Data')
plt.xlabel('Combination of Hour and State')
plt.ylabel('Number of incidents')
We now calculate the correlation between the actual and predicted values.
# Correlation between actual and predicted counts.
act = df_results['Actual']
pred = df_results['Predicted']
print('Correlation is: {}'.format(act.corr(pred)))
# linregress lives in scipy.stats; import the submodule explicitly
# because `import scipy` does not guarantee it is loaded.
import scipy.stats
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(act, pred)
# Fix: linregress returns r (the correlation coefficient); the label
# said "R^2" but the raw r was printed — square it for the true R^2.
print('R^2 value: {}\nP Value: {}\nStandard Error: {}'.format(r_value ** 2, p_value, std_err))
We sort the list of attacks to show the highest combination of hour and State.
# Re-derive the (hour, state) counts and show the fifteen largest.
df_clean = df[['hour', 'STATE']].copy().dropna(axis=0)
df_clean = df_clean.groupby(['hour', 'STATE']).size().reset_index(name='Count')
df_clean.sort_values(by='Count', ascending=False).head(15)